library(tidyverse)
library(parallel)
library(digest)
library(readxl)

# Set the directory to the location of the replication data
setwd("")

# Define US regions
northeast <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire",
               "Rhode Island", "Vermont", "New Jersey", "New York",
               "Pennsylvania")
midwest <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa",
             "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota",
             "South Dakota")
south <- c("Delaware", "Florida", "Georgia", "Maryland", "North Carolina",
           "South Carolina", "Virginia", "District of Columbia", "West Virginia",
           "Alabama", "Kentucky", "Mississippi", "Tennessee", "Arkansas",
           "Louisiana", "Oklahoma", "Texas")
west <- c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico",
          "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon",
          "Washington")

# CITIZEN DESCRIPTIVES

# Load in population-level numbers for quotas, obtained from the survey 
# firm Dyanta
BE_pop <- read_excel("Data_Raw/Citizen Survey ORD-727620-B4X7 overview.xlsx", sheet = "Belgium")
CL_pop <- read_excel("Data_Raw/Citizen Survey ORD-727620-B4X7 overview.xlsx", sheet = "Chile")
DK_pop <- read_excel("Data_Raw/Citizen Survey ORD-727620-B4X7 overview.xlsx", sheet = "Denmark")
US_pop <- read_excel("Data_Raw/Citizen Survey ORD-727620-B4X7 overview.xlsx", sheet = "USA")

X_raw <- bind_rows(tibble(BE_pop, country = "Belgium"),
                   tibble(CL_pop, country = "Chile"),
                   tibble(DK_pop, country = "Denmark"),
                   tibble(US_pop, country = "US"))

# The population data are messy data, with unnecessary columns...
# So keep the group name, the population percentage, and country name
# Rename variables as required from the weird Excel column names
X1 <- X_raw[, c(2, 3, 9)] %>% rename("group" = "...2", "pop_percentage" = "Percentage...3")
X1$pop_percentage[X1$pop_percentage == "Percentage"] <- NA
X1$pop_percentage <- as.numeric(X1$pop_percentage)

X2 <- X_raw[, c(7, 8, 9)] %>% rename("group" = "...7", "pop_percentage" = "Percentage...8")

X <- bind_rows(X1, X2) %>% filter(!is.na(pop_percentage))

# Regions in Denmark have a "Region " in front of them. Remove the "Region " bit
X$group <- str_replace_all(X$group, "^Region ", "")


# Function to extract the gender, age, and education variables from the
# population data and aggregate them for the relevant tables
extract_pop_values <- function(data, country) {

    Pop <- tibble(group = c("Gender: Men", "Gender: Women",
                            "Age: 18-24", "Age: 25-34", "Age: 35-44",
                            "Age: 45-54", "Age: 55-64", "Age: 65+",
                            "Education: Low", "Education: Middle", "Education: High"),
                  country = country, pop_percentage = NA)

    men_groups <- c("M18-24", "M25-34", "M35-44", "M45-54", "M55-64", "M65+")
    women_groups <- c("F18-24", "F25-34", "F35-44", "F45-54", "F55-64", "F65+")
    Pop$pop_percentage[Pop$group == "Gender: Men"] <- sum(data$pop_percentage[data$group %in% men_groups])
    Pop$pop_percentage[Pop$group == "Gender: Women"] <- sum(data$pop_percentage[data$group %in% women_groups])

    age_group_18_24 <- c("M18-24", "F18-24")
    age_group_25_34 <- c("M25-34", "F25-34")
    age_group_35_44 <- c("M35-44", "F35-44")
    age_group_45_54 <- c("M45-54", "F45-54")
    age_group_55_64 <- c("M55-64", "F55-64")
    age_group_65_plus <- c("M65+", "F65+")

    Pop$pop_percentage[Pop$group == "Age: 18-24"] <- sum(data$pop_percentage[data$group %in% age_group_18_24])
    Pop$pop_percentage[Pop$group == "Age: 25-34"] <- sum(data$pop_percentage[data$group %in% age_group_25_34])
    Pop$pop_percentage[Pop$group == "Age: 35-44"] <- sum(data$pop_percentage[data$group %in% age_group_35_44])
    Pop$pop_percentage[Pop$group == "Age: 45-54"] <- sum(data$pop_percentage[data$group %in% age_group_45_54])
    Pop$pop_percentage[Pop$group == "Age: 55-64"] <- sum(data$pop_percentage[data$group %in% age_group_55_64])
    Pop$pop_percentage[Pop$group == "Age: 65+"] <- sum(data$pop_percentage[data$group %in% age_group_65_plus])

    Pop$pop_percentage[Pop$group == "Education: Low"] <- data$pop_percentage[data$group == "Low"]
    Pop$pop_percentage[Pop$group == "Education: Middle"] <- data$pop_percentage[data$group == "Middle"]
    Pop$pop_percentage[Pop$group == "Education: High"] <- data$pop_percentage[data$group == "High"]

    categories_processed <- c(men_groups, women_groups,
                              age_group_18_24, age_group_25_34, age_group_35_44,
                              age_group_45_54, age_group_55_64, age_group_65_plus,
                              "Low", "Middle", "High")

    Regions <- filter(data, !group %in% categories_processed)
    Regions$country <- country
    Regions$group <- str_c("Region: ", Regions$group)

    Pop <- bind_rows(Pop, Regions)

    return(Pop)

}

# Pull out the population values from each country
Pop_Values_Belgium <- extract_pop_values(filter(X, country == "Belgium"), country = "Belgium")
Pop_Values_Chile <- extract_pop_values(filter(X, country == "Chile"), country = "Chile")
Pop_Values_Denmark <- extract_pop_values(filter(X, country == "Denmark"), country = "Denmark")
Pop_Values_US <- extract_pop_values(filter(X, country == "US"), country = "United States")

# Aggregate states to regions in the United States
# First remove "Region: " from the current state names, e.g. "Region: Alaska"
Pop_Values_US$group[str_detect(Pop_Values_US$group, "^Region: ")] <- str_replace_all(Pop_Values_US$group[str_detect(Pop_Values_US$group, "^Region: ")], "^Region: ", "")

Pop_Values_US$region <- NA
Pop_Values_US$region[Pop_Values_US$group %in% northeast] <- "Region: Northeast"
Pop_Values_US$region[Pop_Values_US$group %in% midwest] <- "Region: Midwest"
Pop_Values_US$region[Pop_Values_US$group %in% south] <- "Region: South"
Pop_Values_US$region[Pop_Values_US$group %in% west] <- "Region: West"
Pop_Values_US$group[!is.na(Pop_Values_US$region)] <- Pop_Values_US$region[!is.na(Pop_Values_US$region)]

# Aggregate from state to region
Pop_Values_US <- Pop_Values_US %>%
                 group_by(country, group) %>%
                 summarize(pop_percentage = sum(pop_percentage))

# Bind all the country population data together
Population_Values <- bind_rows(Pop_Values_Belgium, Pop_Values_Chile,
                               Pop_Values_Denmark, Pop_Values_US)

write_rds(Population_Values, "Data/Citizen_Population_Values.rds")




# POLITICIAN DESCRIPTIVES

# Denmark population numbers
DK_pop <- read_csv("Data_Raw/DK Pop Data.csv")

# Chile population numbers
CL_Pop <- read_csv("Data_Raw/Chile_Background_data.csv")

names(CL_Pop)[names(CL_Pop) == "Genero"] <- "gender"
names(CL_Pop)[names(CL_Pop) == "Party"] <- "party"
names(CL_Pop)[names(CL_Pop) == "Coalition"] <- "coalition"

CL_Pop$gender[CL_Pop$gender == "Masculino"] <- "Male"
CL_Pop$gender[CL_Pop$gender == "Femenino"] <- "Female"

CL_Pop$coalition[CL_Pop$coalition %in% c("Others Center", "Others LW", "Others RW")] <- "Other"
CL_Pop$party <- CL_Pop$coalition

CL_Pop$country <- "Chile"



# United States & Belgian population numbers
# added manually below:
# from CivicPulse (US) and from Karolin (who fielded the survey containing the
# Belgian sub-module)
Population_Values <- bind_rows(DK_pop, CL_Pop)

Party_ID_Pop <- Population_Values %>%
                filter(!is.na(party)) %>%
                group_by(country, party) %>%
                summarize(count_collected = n()) %>%
                ungroup() %>%
                group_by(country) %>%
                mutate(percentage = count_collected / sum(count_collected) * 100) %>%
                mutate(group = str_c("Party ID: ", party)) %>%
                select(group, country, percentage) %>%
                ungroup()

us_democrats_percent <- NA
us_republicans_percent <- NA
us_other_percent <- NA

be_cdv_percent <- 22
be_groen_percent <- 6
be_nva_percent <- 20
be_openvld_percent <- 11
be_pvda_percent <- 0
be_vooruit_percent <- 7
be_vlaamsbelang_percent <- 6
be_other_percent <- 28

Party_ID_Pop <- Party_ID_Pop %>%
              add_row(group = "Party ID: Democratic Party", country = "United States", percentage = us_democrats_percent) %>%
              add_row(group = "Party ID: Republican Party", country = "United States", percentage = us_republicans_percent) %>%
              add_row(group = "Party ID: Other", country = "United States", percentage = us_other_percent) %>%
              add_row(group = "Party ID: CD&V", country = "Belgium", percentage = be_cdv_percent) %>%
              add_row(group = "Party ID: Groen", country = "Belgium", percentage = be_groen_percent) %>%
              add_row(group = "Party ID: N-VA", country = "Belgium", percentage = be_nva_percent) %>%
              add_row(group = "Party ID: Open Vld", country = "Belgium", percentage = be_openvld_percent) %>%
              add_row(group = "Party ID: PvdA", country = "Belgium", percentage = be_pvda_percent) %>%
              add_row(group = "Party ID: Vooruit", country = "Belgium", percentage = be_vooruit_percent) %>%
              add_row(group = "Party ID: Vlaams Belang", country = "Belgium", percentage = be_vlaamsbelang_percent) %>%
              add_row(group = "Party ID: Other", country = "Belgium", percentage = be_other_percent)

Gender_Pop <- Population_Values %>%
              filter(!is.na(gender)) %>%
              group_by(country, gender) %>%
              summarize(count_collected = n()) %>%
              ungroup() %>%
              group_by(country) %>%
              mutate(percentage = count_collected / sum(count_collected) * 100) %>%
              mutate(group = recode(gender, "Male" = "Gender: Men", "Female" = "Gender: Women")) %>%
              select(group, country, percentage) %>%
              ungroup()

be_women_percent <- 34.1
be_men_percent <- 65.9
us_women_percent <- NA
us_men_percent <- NA

Gender_Pop <- Gender_Pop %>%
              add_row(group = "Gender: Women", country = "Belgium", percentage = be_women_percent) %>%
              add_row(group = "Gender: Men", country = "Belgium", percentage = be_men_percent) %>%
              add_row(group = "Gender: Women", country = "United States", percentage = us_women_percent) %>%
              add_row(group = "Gender: Men", country = "United States", percentage = us_men_percent)

# Note that the US population numbers come from the survey firm, Civic Pulse,
# and are manually inputted into Table C10 of the Appendix
# Hence why they are NAs in this dataset
Population_Values <- bind_rows(Gender_Pop, Party_ID_Pop)

write_rds(Population_Values, "Data/Politician_Population_Values.rds")

